In [1]:
%%HTML
<img src="home/nati/Pictures/otto_competition.JPG" width="10" height="10">
The goals of this tutorial notebook are to:
a) introduce you to the process and approach for performing Exploratory Data Analysis (EDA)
b) get you to train various classifiers and explore their results
c) use these trained models to predict the target variable (in this example dataset, the type of a product)
Let's begin by importing some common libraries we discussed in the previous part.
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
my_color_map = ['green','aqua','pink','blue','red','black','yellow','teal','orange','grey']
Now let's load our dataset for this tutorial: the Otto dataset.
In [3]:
tr_data = pd.read_csv('../input/train.csv')
te_data = pd.read_csv('../input/test.csv')
print('train shape is: {}\ntest shape is: {}'.format(tr_data.shape, te_data.shape))
pandas has lots of great features that can help us gain insights into the data with very little effort. Let's begin by exploring some statistics of the numerical features:
In [4]:
tr_data.describe()
Out[4]:
This format is somewhat problematic since:
1) when we scroll sideways we notice that not all columns are shown, so we cannot explore them
2) the data is very wide, so we're not using the screen very efficiently
We can solve the first problem by setting some of pandas' display options; as for the screen usage, we can transpose the resulting dataframe.
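If you prefer not to change the global settings, a scoped alternative (a minimal sketch) is pandas' option_context, which applies the display options only inside a with-block:
In [ ]:
# sketch: scoped alternative to setting the global display options (as done in the next cell) -
# the settings apply only inside the with-block
with pd.option_context('display.max_rows', 200, 'display.max_columns', 50):
    print(tr_data.describe().T)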
In [9]:
#set number of rows and columns to see
pd.options.display.max_rows = 200
pd.options.display.max_columns = 50
#use transposed view of the features
desc = tr_data.describe().T
desc['count_nonzero'] = np.count_nonzero(tr_data,axis=0)[:-1]
desc['num_uniques'] = [len(tr_data[x].unique()) for x in tr_data.columns[:-1]]
desc['uniques'] = [(tr_data[x].unique()) for x in tr_data.columns[:-1]]
desc
Out[9]:
Another great feature of the pandas package is the simplicity of exploring the value distribution of the target variable and of each of the features.
In [11]:
print('the value counts of the target are:')
print(tr_data.iloc[:,-1].value_counts())
tr_data.iloc[:,-1].value_counts().plot(kind='bar')
In [ ]:
for feat in tr_data.columns[1:-1]:  # skip the first column (item id) and the last column (target)
    print('the value counts of feature {} are:'.format(feat))
    print(tr_data[feat].value_counts())
In [14]:
def value_counts_plots(dat, rows=4, cols=4):
    _, ax = plt.subplots(rows, cols, sharey='row', sharex='col', figsize=(cols*5, rows*5))
    for i, feat in enumerate(dat.columns[:(rows*cols)]):
        dat[feat].value_counts().iloc[:20].plot(kind='bar', ax=ax[i // cols, i % cols],
                                                title='value_counts {}'.format(feat))

value_counts_plots(tr_data.iloc[:, 1:9], 2, 4)
In [18]:
tr_data['parsed_target'] = [int(x.split('_')[1]) for x in tr_data.target]
tr_data.drop('target',axis=1,inplace=True)
In [22]:
def target_bar_plots(dat, cols=4, rows=4):
    _, ax = plt.subplots(rows, cols, sharey='row', sharex='col', figsize=(cols*5, rows*5))
    for i, feat in enumerate(dat.columns[:(rows*cols)]):
        try:
            dat.pivot_table(index=['parsed_target'], values=feat, aggfunc=np.count_nonzero).plot(
                kind='bar', ax=ax[i // cols, i % cols],
                title='non_zero values by category for {}'.format(feat))
        except:
            pass

target_bar_plots(tr_data, 4, 4)
While examining these plots we can already make some assumptions as to which categories will be easier to predict and which will be harder - can you guess?
Now let's look at the test set features and check whether they resemble the train features.
In [23]:
tr_data['source'] = 'train'
te_data['source'] = 'test'
all_data = pd.concat([tr_data,te_data],axis=0)
tr_data.drop('source',axis=1,inplace=True)
te_data.drop('source',axis=1,inplace=True)
In [37]:
molten = pd.melt(all_data, id_vars = 'source',value_vars = ['feat_'+str(x) for x in [13,14,27]])
plt.subplots(figsize = (20,8))
sns.violinplot(data=molten, x= 'variable',y='value',hue='source',split = True,palette=my_color_map)
Out[37]:
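The violin plots give a visual comparison. As a quantitative complement (a sketch, assuming scipy is available), we can run a two-sample Kolmogorov-Smirnov test on every feature; the larger the statistic, the more the train and test distributions differ:
In [ ]:
# sketch: two-sample KS test per feature to quantify train/test distribution differences
from scipy.stats import ks_2samp

feat_cols = [c for c in te_data.columns if c.startswith('feat_')]
ks_stats = pd.Series({c: ks_2samp(tr_data[c], te_data[c]).statistic for c in feat_cols})
print(ks_stats.sort_values(ascending=False).head(10))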
In [41]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(tr_data.iloc[:,1:-1],tr_data.parsed_target,test_size = 0.2,random_state =2017)
In [49]:
%%time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, log_loss
knn = KNeighborsClassifier(n_jobs=4,n_neighbors=4)
knn.fit(X_train,y_train)
knn4_pred = knn.predict(X_val)
knn4_pred_proba = knn.predict_proba(X_val)
print(confusion_matrix(y_pred=knn4_pred,y_true=y_val))
print('log loss: {}'.format(log_loss(y_pred=np.clip(knn4_pred_proba,a_max=0.999,a_min=0.001),y_true=pd.get_dummies(y_val-1))))
sns.heatmap(xticklabels=range(1,10),yticklabels=range(1,10),data = confusion_matrix(y_pred=knn4_pred,y_true=y_val),cmap='Greens')
In [50]:
from sklearn.metrics import classification_report
print('classification report results:\r\n' + classification_report(y_pred=knn4_pred,y_true=y_val))
As we can see, our assumption was indeed correct: categories 6, 8 and 2 are the easiest to predict.
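To put numbers on that, one option (a sketch; output_dict requires scikit-learn >= 0.20) is to pull the per-class F1 scores out of the classification report and sort them:
In [ ]:
# sketch: per-class F1 scores for the KNN predictions, sorted from easiest to hardest class
report = classification_report(y_pred=knn4_pred, y_true=y_val, output_dict=True)
per_class_f1 = pd.Series({label: scores['f1-score']
                          for label, scores in report.items() if label.isdigit()})
print(per_class_f1.sort_values(ascending=False))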
In [60]:
%%time
#a class-weight dictionary like this would give higher importance to correctly classifying the rarer classes
#(note that class_weight is commented out in the classifier below, so these weights are not actually used here)
class_weights = {1:8,2:1,3:2,4:5,5:5,6:1,7:5,8:2,9:3}
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(#class_weight=class_weights,
max_depth=15,max_features=92,min_samples_split=2,random_state=12345)
dtc.fit(X_train,y_train)
tree_pred = dtc.predict(X_val)
tree_pred_proba = dtc.predict_proba(X_val)
print(confusion_matrix(y_pred=tree_pred,y_true=y_val))
print('log loss: {}'.format(log_loss(y_pred=np.clip(tree_pred_proba,a_max=0.999,a_min=0.001),y_true=pd.get_dummies(y_val-1))))
sns.heatmap(confusion_matrix(y_pred=tree_pred,y_true=y_val),cmap='Greens',xticklabels=range(1,10),yticklabels=range(1,10))
print('classification report results:\r\n' + classification_report(y_pred=tree_pred,y_true=y_val))
Let's see if support vector machines do any better.
In [61]:
from sklearn.svm import SVC
svc = SVC(kernel='linear',C=0.1,max_iter=100,random_state=12345)
svc.fit(X_train,y_train)
svc_pred = svc.predict(X_val)
print(confusion_matrix(y_pred=svc_pred,y_true=y_val))
sns.heatmap(confusion_matrix(y_pred=svc_pred,y_true=y_val),cmap='Greens',xticklabels=range(1,10),
yticklabels=range(1,10))
print('classification report results:\r\n' + classification_report(y_pred=svc_pred,y_true=y_val))
In [62]:
#this cell takes some time to run; it's not essential for the rest of the notebook
from sklearn.preprocessing import MinMaxScaler
svc = SVC(kernel='linear',C=0.1,max_iter=10000,random_state=12345)
mms = MinMaxScaler()
mms.fit(X_train)
X_train_scaled = mms.transform(X_train)
X_val_scaled = mms.transform(X_val)
svc.fit(X_train_scaled,y_train)
svc_pred = svc.predict(X_val_scaled)
print(confusion_matrix(y_pred=svc_pred,y_true=y_val))
sns.heatmap(confusion_matrix(y_pred=svc_pred,y_true=y_val),cmap='Greens',xticklabels=range(1,10),
yticklabels=range(1,10))
print('classification report results:\r\n' + classification_report(y_pred=svc_pred,y_true=y_val))
We can see that we get less accurate results overall, but we achieved better results for the class we selected to be more important.
Let's try ensemble learning - we'll start with random forest.
In [64]:
%%time
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_jobs=4,n_estimators=100)
rfc.fit(X_train,y_train)
rfc_pred = rfc.predict(X_val)
rfc_pred_proba = rfc.predict_proba(X_val)
print(confusion_matrix(y_pred=rfc_pred,y_true=y_val))
print('log loss: {}'.format(log_loss(y_pred=np.clip(rfc_pred_proba,a_max=0.999,a_min=0.001),y_true=pd.get_dummies(y_val-1))))
sns.heatmap(confusion_matrix(y_pred=rfc_pred,y_true=y_val),cmap='Greens',xticklabels=range(1,10),yticklabels=range(1,10))
print('classification report results:\r\n' + classification_report(y_pred=rfc_pred,y_true=y_val))
Yes!
The random forest model got the highest score so far with no special effort - just applying fit and predict.
This is logical considering the overfitting we experienced earlier with the decision tree classifier.
Let's check if gradient boosting can further improve on that.
*Note that in order to keep each individual tree a weak (under-fitted) learner, we limit its depth to 6.
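To see that overfitting directly, a quick sketch (reusing the dtc and rfc objects fitted above) is to compare train and validation accuracy - a large gap indicates overfitting:
In [ ]:
# sketch: train vs. validation accuracy for the already-fitted decision tree and random forest
for name, model in [('decision tree', dtc), ('random forest', rfc)]:
    print('{}: train accuracy = {:.3f}, validation accuracy = {:.3f}'.format(
        name, model.score(X_train, y_train), model.score(X_val, y_val)))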
In [68]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=100,max_depth=6)
gbc.fit(X_train,y_train)
gbc_pred = gbc.predict(X_val)
gbc_pred_proba = gbc.predict_proba(X_val)
print(confusion_matrix(y_pred=gbc_pred,y_true=y_val))
print('log loss: {}'.format(log_loss(y_pred=np.clip(gbc_pred_proba,a_max=0.999,a_min=0.001),y_true=pd.get_dummies(y_val-1))))
sns.heatmap(confusion_matrix(y_pred=gbc_pred,y_true=y_val),cmap='Greens',xticklabels=range(1,10),yticklabels=range(1,10))
print('classification report results:\r\n' + classification_report(y_pred=gbc_pred,y_true=y_val))
In [70]:
%%time
import xgboost as xgb
dtrain = xgb.DMatrix(data=X_train,label=y_train-1) #xgb class labels start from zero
dval = xgb.DMatrix(data=X_val,label=y_val-1) #xgb class labels start from zero
watchlist = [ (dtrain,'train'),(dval,'eval'),]
xgb_params = {
'eta': 0.05,
'max_depth': 7,
'subsample': 0.9,
'colsample_bytree': 0.9,
'colsample_bylevel': 0.7,
'lambda':0.1,
'objective': 'multi:softmax',
'eval_metric': 'mlogloss',
'min_child_weight':2,
'num_class' : 9
}
bst = xgb.train(params=xgb_params,dtrain=dtrain,num_boost_round=400,evals=watchlist,verbose_eval=10)
In [71]:
xgb_pred = bst.predict(dval)
# retrain with objective 'multi:softprob' so that predict returns class probabilities (needed for the log loss)
xgb_params = {
'eta': 0.05,
'max_depth': 7,
'subsample': 0.9,
'colsample_bytree': 0.9,
'colsample_bylevel': 0.7,
'lambda':0.1,
'objective': 'multi:softprob',
'eval_metric': 'mlogloss',
'min_child_weight':2,
'num_class' : 9
}
bst = xgb.train(params=xgb_params,dtrain=dtrain,num_boost_round=400,evals=watchlist,verbose_eval=10)
In [72]:
xgb_pred_proba = bst.predict(dval)
print(confusion_matrix(y_pred=xgb_pred+1,y_true=y_val)) #shift the 0-8 predictions back to the 1-9 labels
print('log loss: {}'.format(log_loss(y_pred=np.clip(xgb_pred_proba,a_max=0.999,a_min=0.001),y_true=pd.get_dummies(y_val-1))))
sns.heatmap(confusion_matrix(y_pred=xgb_pred+1,y_true=y_val),cmap='Greens',xticklabels=range(1,10),yticklabels=range(1,10))
print('classification report results:\r\n' + classification_report(y_pred=xgb_pred+1,y_true=y_val))
Wow! We got an average F1 score of 82% - this looks great!
Let's predict the results on the test set using the xgboost model and create a submission for the Kaggle platform.
In [ ]:
test_pred = bst.predict(xgb.DMatrix(te_data.iloc[:,1:]))
In [ ]:
subm = pd.DataFrame(test_pred)
subm.columns = ['class_'+ str(x) for x in range(1,10)]
subm.index = te_data.id
subm.to_csv('../subm/xgboost_classification_submission.csv')
In [ ]:
#let's make sure our prediction fits the desired format:
print(subm.head())
print('submission shape: {}'.format(subm.shape))
print('')
print("great! we're good to go on and submit our results")